Meteo Imputation
  • Library Docs

On this page

  • Correlation
  • Comparison Imputation methods
    • State of the art
    • Percentage improvement
    • Main plot
    • Table
    • Timeseries
  • Kalman Filter analysis
    • Gap len
    • Control
    • Gap in Other variables
    • Generic vs Specialized
    • Training
  • Extra results
    • Standard deviations
    • Gap distribution
    • Square Root Filter

Plotting for results

This notebook produces all results plots. It generates some gaps in the data, fills them with a method (filter, MDS, …), computes metrics, and then makes all the relevant plots.

%load_ext autoreload
%autoreload 2
import altair as alt
from meteo_imp.kalman.results import *
from meteo_imp.data import *
from meteo_imp.utils import *
import pandas as pd
import numpy as np
from pyprojroot import here
import torch
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
from IPython.display import SVG, Image

from meteo_imp.kalman.results import _plot_timeseries, _get_labels
from functools import partial
from contextlib import redirect_stderr
import io

import polars as pl
from fastai.vision.data import get_grid
import cairosvg

The generation of a proper pdf is complex, as altair_render doesn't support XOffset, so plots are first rendered to svg using vl-convert and then to pdf using cairosvg. However, this last method doesn't support negative numbers …

Due to the high number of samples, the browser renderer also cannot be used in the notebook, so vl-convert is used to render to a png for the visualization in the notebook.

import vl_convert as vlc
from pyprojroot import here
# Output directories inside the manuscript tree: figures go to images/, LaTeX tables to tables/.
base_path_img = here("manuscript/Master Thesis - Evaluation of Kalman filter for meteorological time series imputation for Eddy Covariance applications - Simone Massaro/images/")
base_path_tbl = here("manuscript/Master Thesis - Evaluation of Kalman filter for meteorological time series imputation for Eddy Covariance applications - Simone Massaro/tables/")

# Create both directories if missing (the tuple expression just evaluates both calls).
base_path_img.mkdir(exist_ok=True), base_path_tbl.mkdir(exist_ok=True)

def save_show_plot(plot,
                   path,
                   altair_render=False # use altair render for pdf?
                  ):
    """Save an Altair chart as SVG + PDF (or PDF only) and return a PNG for notebook display.

    Parameters
    ----------
    plot : altair chart
        chart to save
    path : str
        file name without extension, relative to ``base_path_img``
    altair_render : bool
        if True use altair's own saver for the pdf (needed when the
        vl-convert → cairosvg pipeline can't handle the chart),
        otherwise render svg with vl-convert and convert it with cairosvg

    Returns
    -------
    IPython.display.Image
        png rendering of the chart for inline display
    """
    plt_json = plot.to_json()
    if not altair_render:
        svg_path = base_path_img / (path + ".svg")
        svg_data = vlc.vegalite_to_svg(vl_spec=plt_json)
        with open(svg_path, 'w') as f:
            f.write(svg_data)

        # fix: open the svg inside a context manager so the handle is closed
        # (previously the file object passed to cairosvg was never closed)
        with open(svg_path) as svg_file:
            cairosvg.svg2pdf(file_obj=svg_file, write_to=str(base_path_img / (path + ".pdf")))
    else:
        # altair's saver is noisy on stderr; silence it
        with redirect_stderr(io.StringIO()):
            plot.save(base_path_img / (path + ".pdf"))

    # render to image for displaying in notebook (reuse the json computed above)
    png_data = vlc.vegalite_to_png(vl_spec=plt_json, scale=1)
    return Image(png_data)
reset_seed()
n_rep = 500  # number of gap repetitions used throughout the notebook
# site data (columns reordered to the canonical variable order) and the ERA
# data used later as the `control` input of the comparisons
hai = pd.read_parquet(hai_big_path).reindex(columns=var_type.categories)
hai_era = pd.read_parquet(hai_era_big_path)
alt.data_transformers.disable_max_rows() # it is safe to do so as the plots are rendered using vl-convert and then showed as images
DataTransformerRegistry.enable('default')

Correlation

import matplotlib.pyplot as plt
import statsmodels.api as sm
def auto_corr_df(data, nlags=96):
    """Temporal autocorrelation (ACF) of every column of `data`.

    Parameters
    ----------
    data : pd.DataFrame
        one column per variable; each column is passed to ``sm.tsa.acf``
    nlags : int
        number of lags to compute (default 96 = 48 h of half-hourly data)

    Returns
    -------
    pd.DataFrame
        one column per input variable, one row per lag (0..nlags)
    """
    # dict comprehension instead of a manual fill loop
    return pd.DataFrame({col: sm.tsa.acf(data[col], nlags=nlags) for col in data.columns})
# long format for plotting; the lag index becomes "gap_len"
auto_corr = auto_corr_df(hai).reset_index(names="gap_len").melt(id_vars="gap_len")
auto_corr.gap_len = auto_corr.gap_len / 2  # convert half-hourly lag steps to hours
auto_corr
gap_len variable value
0 0.0 TA 1.000000
1 0.5 TA 0.998595
2 1.0 TA 0.995814
3 1.5 TA 0.992141
4 2.0 TA 0.987630
... ... ... ...
868 46.0 TS 0.959680
869 46.5 TS 0.961116
870 47.0 TS 0.962085
871 47.5 TS 0.962551
872 48.0 TS 0.962480

873 rows × 3 columns

# One line chart of autocorrelation vs gap length, faceted per variable
# (independent axes per facet), then formatted and saved for the manuscript.
p = (alt.Chart(auto_corr).mark_line().encode(
    x = alt.X('gap_len', title="Gap length [h]", axis = alt.Axis(values= [12, 24, 36, 48])),
    y = alt.Y("value", title="correlation"),
    color=alt.Color("variable", scale=meteo_scale, title="Variable"),
    facet =alt.Facet('variable', columns=3, sort = meteo_scale.domain, title=None,
                     header = alt.Header(labelFontWeight="bold", labelFontSize=20))
)
    .properties(height=120, width=250)
    .resolve_scale(y='independent', x = 'independent')
    .pipe(plot_formatter))

save_show_plot(p, "temporal_autocorrelation")

# Inter-variable correlation heatmap (seaborn on a matplotlib grid).
axes = get_grid(1,1,1, figsize=(10,8))
sns.set(font_scale=1.25)
sns.heatmap(hai.corr(), annot=True,     vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200), ax=axes[0], square=True, cbar=True)
# axes[0].set(xlabel="Variable", ylabel="Variable", title="Inter-variable Correlation");
# size_old = plt.rcParams["axes.labelsize"]
# w_old = plt.rcParams["axes.labelweight"]
# plt.rcParams["axes.labelsize"] = 30
# plt.rcParams["axes.labelweight"] = 'bold'
plt.tight_layout()
plt.xticks(weight = 'bold')
plt.yticks(weight = 'bold')

# NOTE(review): rc_context only wraps savefig/show here, after the axes were
# already drawn — confirm the labelsize override actually takes effect.
with matplotlib.rc_context({"axes.labelsize": 30}):
    plt.savefig(base_path_img / "correlation.pdf")
    plt.show()

# plt.rcParams["axes.labelsize"] = size_old
# plt.rcParams["axes.labelweight"] = w_old

Comparison Imputation methods

base_path = here("analysis/results/trained_models")
# NOTE(review): torch.load unpickles arbitrary objects — only load trusted model files.
def l_model(x, base_path=base_path): return torch.load(base_path / x)
# One specialized (fine-tuned) model per variable; P uses the same file as the
# generic multi-variable model ("1_gap_varying_6-336_v3.pickle", see below).
models_var = pd.DataFrame.from_records([
    {'var': 'TA',    'model': l_model("TA_specialized_gap_6-336_v3_0.pickle",base_path)},
    {'var': 'SW_IN', 'model': l_model("SW_IN_specialized_gap_6-336_v2_0.pickle",base_path)},
    {'var': 'LW_IN', 'model': l_model("LW_IN_specialized_gap_6-336_v1.pickle",base_path)},
    {'var': 'VPD',   'model': l_model("VPD_specialized_gap_6-336_v2_0.pickle",base_path)},
    {'var': 'WS',    'model': l_model("WS_specialized_gap_6-336_v1.pickle",base_path)},
    {'var': 'PA',    'model': l_model("PA_specialized_gap_6-336_v3_0.pickle",base_path)},
    {'var': 'P',     'model': l_model("1_gap_varying_6-336_v3.pickle",base_path)},
    {'var': 'TS',    'model': l_model("TS_specialized_gap_6-336_v2_0.pickle",base_path)},
    {'var': 'SWC',   'model': l_model("SWC_specialized_gap_6-336_v2_1.pickle",base_path)},
])
@cache_disk(cache_dir / "the_results")
def get_the_results(n_rep=20):
    # Main comparison: all methods, gaps of 12/24/48/336 steps, every variable.
    # Cached on disk so re-running the notebook is cheap.
    reset_seed()
    comp_Av = ImpComparison(models = models_var, df = hai, control = hai_era, block_len = 446, time_series=False)
    results_Av = comp_Av.compare(gap_len = [12,24, 48, 336], var=list(hai.columns), n_rep=n_rep) 
    return results_Av

results_Av = get_the_results(n_rep)

State of the art

The first plot is a time series using only the state-of-the-art methods.

# Time-series example with only the state-of-the-art methods (Kalman Filter
# rows are dropped); one repetition of a 48-step gap for every variable.
reset_seed()
comp_ts = ImpComparison(models = models_var, df = hai, control = hai_era, block_len = 48+100, time_series=True, rmse=False)
results_ts = comp_ts.compare(gap_len = [48], var=list(hai.columns), n_rep=1) 
res_ts = results_ts.query("method != 'Kalman Filter'")
res_ts_plot = pd.concat([unnest_predictions(row, ctx_len=72) for _,row in res_ts.iterrows()])
# color scale restricted to the two remaining methods
scale_sota = alt.Scale(domain=["ERA-I", "MDS"], range=list(sns.color_palette('Dark2', 3).as_hex())[1:])
p = (facet_wrap(res_ts_plot, partial(_plot_timeseries, scale_color=scale_sota, err_band = False), col="var",
                y_labels = _get_labels(res_ts_plot, 'mean', None),
               )
            .pipe(plot_formatter)
    )
save_show_plot(p, "timeseries_sota", altair_render=True)

Percentage improvement

results_Av.method.unique()
['Kalman Filter', 'ERA-I', 'MDS']
Categories (3, object): ['Kalman Filter' < 'ERA-I' < 'MDS']
all_res = results_Av.query('var != "P"').groupby(['method']).agg({'rmse_stand': 'mean'}).T
all_res
method Kalman Filter ERA-I MDS
rmse_stand 0.204628 0.307361 0.482837

Percentage of improvement across all variables

(all_res["ERA-I"] - all_res["Kalman Filter"]) / all_res["ERA-I"] * 100 
rmse_stand    33.42398
dtype: float64
(all_res["MDS"] - all_res["Kalman Filter"]) / all_res["MDS"] * 100 
rmse_stand    57.619542
dtype: float64
# same improvement percentages, broken down per variable
res_var = results_Av.groupby(['method', 'var']).agg({'rmse_stand': 'mean'}) 
res_var = res_var.reset_index().pivot(columns='method', values='rmse_stand', index='var')
pd.DataFrame({'ERA': (res_var["ERA-I"] - res_var["Kalman Filter"]) / res_var["ERA-I"] * 100, 'MDS': (res_var["MDS"] - res_var["Kalman Filter"]) / res_var["MDS"] * 100 })
ERA MDS
var
TA 54.540802 77.713711
SW_IN 12.004508 35.516142
LW_IN 5.166063 52.289627
VPD 44.402821 65.407769
WS 21.064305 40.321732
PA 28.784191 90.751559
P -18.544370 -22.084360
SWC NaN 41.543006
TS NaN 25.772326
# improvement percentages broken down per variable AND gap length
res_var2 = results_Av.groupby(['method', 'var', 'gap_len']).agg({'rmse_stand': 'mean'}) 
res_var2 = res_var2.reset_index().pivot(columns='method', values='rmse_stand', index=['var', 'gap_len'])
pd.DataFrame({'ERA': (res_var2["ERA-I"] - res_var2["Kalman Filter"]) / res_var2["ERA-I"] * 100, 'MDS': (res_var2["MDS"] - res_var2["Kalman Filter"]) / res_var2["MDS"] * 100 })
ERA MDS
var gap_len
TA 6 69.897582 85.052698
12 58.766166 79.376385
24 51.538443 75.395970
168 41.823614 73.000401
SW_IN 6 9.519984 29.746651
12 11.165399 30.639223
24 14.232051 34.811941
168 12.305658 42.651906
LW_IN 6 21.023524 59.136518
12 9.110040 52.211404
24 -3.553292 50.720632
168 -4.260023 48.223005
VPD 6 66.980942 79.449579
12 47.785633 69.081018
24 33.663749 56.728120
168 32.272332 57.702579
WS 6 32.402977 45.724043
12 25.209162 43.275430
24 15.543672 37.142502
168 12.735569 36.436106
PA 6 39.823585 91.511486
12 30.995845 90.532461
24 24.727301 89.319180
168 20.691181 91.421434
P 6 -18.485009 -13.917879
12 -28.935358 -37.127331
24 -24.423076 -29.998707
168 -7.725322 -11.587796
SWC 6 NaN 61.302664
12 NaN 47.976950
24 NaN 42.535719
168 NaN 23.301469
TS 6 NaN 64.264901
12 NaN 46.699870
24 NaN 27.050291
168 NaN -15.268479

Main plot

from itertools import product
import altair as alt
# main comparison figure: absolute RMSE …
p = the_plot(results_Av)
save_show_plot(p, "the_plot")

# … and the standardized-RMSE variant
p = the_plot_stand(results_Av)
save_show_plot(p, "the_plot_stand")

Table

# RMSE summary table (mean/std per method, variable and gap), exported to LaTeX
t = the_table(results_Av)
the_table_latex(t, base_path_tbl / "the_table.tex", label="tbl:the_table",
                caption="\\CapTheTable")
t
Kalman Filter ERA-I MDS
RMSE mean std mean std mean std
Variable Gap
TA 6 h 0.405453 0.258301 1.346910 0.997843 2.712546 1.896914
12 h 0.606836 0.400849 1.471695 0.900611 2.942435 1.748131
1 day (24 h) 0.741275 0.368468 1.529614 0.800256 3.012819 1.611311
1 week (168 h) 1.020608 0.444591 1.754334 0.643160 3.780087 1.315472
SW_IN 6 h 44.636609 40.464629 49.333113 66.241975 63.536627 85.401585
12 h 48.155186 33.868178 54.207691 49.769296 69.427115 68.936352
1 day (24 h) 56.564277 30.042752 65.950367 40.930505 86.770917 59.603564
1 week (168 h) 61.582820 25.740161 70.224393 34.883199 107.384249 53.606111
LW_IN 6 h 10.902409 7.736087 13.804628 12.987987 26.680077 15.022366
12 h 13.421656 7.734502 14.766929 12.584725 28.085478 13.457335
1 day (24 h) 14.593819 7.840046 14.093052 12.227900 29.614461 12.416763
1 week (168 h) 17.062880 6.425136 16.365697 11.129569 32.954558 8.833972
VPD 6 h 0.428187 0.363168 1.296787 1.547397 2.083592 2.149288
12 h 0.660623 0.504761 1.265213 1.288794 2.136626 2.095549
1 day (24 h) 0.827563 0.501975 1.247527 1.032319 1.912472 1.605013
1 week (168 h) 1.125680 0.633392 1.662069 1.127314 2.661345 1.965431
WS 6 h 0.616774 0.316972 0.912428 0.508295 1.136367 0.783146
12 h 0.715412 0.350974 0.956550 0.524247 1.261203 0.796744
1 day (24 h) 0.801851 0.343378 0.949427 0.446912 1.275665 0.608630
1 week (168 h) 0.950211 0.363124 1.088887 0.348541 1.494891 0.615371
PA 6 h 0.045046 0.034294 0.074856 0.061726 0.530665 0.441476
12 h 0.053359 0.041613 0.077328 0.058476 0.563603 0.427426
1 day (24 h) 0.059481 0.038666 0.079021 0.051491 0.556899 0.404451
1 week (168 h) 0.066325 0.047544 0.083628 0.053654 0.773143 0.384029
P 6 h 0.134093 0.274033 0.113173 0.315504 0.117710 0.305539
12 h 0.178871 0.295419 0.138729 0.297227 0.130442 0.281377
1 day (24 h) 0.206231 0.253588 0.165750 0.288432 0.158641 0.265257
1 week (168 h) 0.239885 0.173820 0.222682 0.201782 0.214975 0.197499
SWC 6 h 0.508379 0.487342 NaN NaN 1.313730 1.556829
12 h 0.664855 0.471849 NaN NaN 1.278001 1.323011
1 day (24 h) 0.779066 0.640996 NaN NaN 1.355740 1.472185
1 week (168 h) 1.493784 0.947799 NaN NaN 1.947605 1.488284
TS 6 h 0.341080 0.431992 NaN NaN 0.954469 0.889126
12 h 0.534363 0.783787 NaN NaN 1.002555 0.876784
1 day (24 h) 0.786670 0.851931 NaN NaN 1.078373 0.856964
1 week (168 h) 1.659875 1.077782 NaN NaN 1.440008 0.764040
# same table but with the standardized RMSE metric
t = the_table(results_Av, 'rmse_stand', y_name="Stand. RMSE")
the_table_latex(t, base_path_tbl / "the_table_stand.tex", stand = True, label="tbl:the_table_stand", 
                caption = "\\CapTheTableStand")
t
Kalman Filter ERA-I MDS
Stand. RMSE mean std mean std mean std
Variable Gap
TA 6 h 0.051164 0.032595 0.169965 0.125917 0.342294 0.239370
12 h 0.076576 0.050583 0.185712 0.113647 0.371303 0.220595
1 day (24 h) 0.093541 0.046497 0.193021 0.100984 0.380185 0.203330
1 week (168 h) 0.128790 0.056103 0.221378 0.081160 0.477006 0.165998
SW_IN 6 h 0.218804 0.198354 0.241826 0.324711 0.311450 0.418630
12 h 0.236052 0.166018 0.265721 0.243964 0.340325 0.337919
1 day (24 h) 0.277272 0.147267 0.323282 0.200637 0.425342 0.292171
1 week (168 h) 0.301873 0.126176 0.344233 0.170994 0.526387 0.262772
LW_IN 6 h 0.259855 0.184387 0.329028 0.309564 0.635910 0.358053
12 h 0.319900 0.184349 0.351964 0.299952 0.669407 0.320751
1 day (24 h) 0.347838 0.186865 0.335903 0.291448 0.705850 0.295949
1 week (168 h) 0.406688 0.153141 0.390071 0.265269 0.785460 0.210555
VPD 6 h 0.098019 0.083135 0.296855 0.354224 0.476967 0.492006
12 h 0.151227 0.115548 0.289627 0.295025 0.489108 0.479704
1 day (24 h) 0.189442 0.114910 0.285579 0.236314 0.437795 0.367413
1 week (168 h) 0.257686 0.144994 0.380474 0.258060 0.609224 0.449918
WS 6 h 0.379454 0.195008 0.561347 0.312715 0.699120 0.481810
12 h 0.440138 0.215927 0.588492 0.322529 0.775922 0.490176
1 day (24 h) 0.493318 0.211254 0.584110 0.274951 0.784819 0.374443
1 week (168 h) 0.584592 0.223403 0.669909 0.214431 0.919692 0.378591
PA 6 h 0.052675 0.040103 0.087534 0.072180 0.620545 0.516250
12 h 0.062397 0.048661 0.090425 0.068381 0.659061 0.499820
1 day (24 h) 0.069556 0.045215 0.092405 0.060212 0.651223 0.472953
1 week (168 h) 0.077558 0.055597 0.097793 0.062741 0.904092 0.449073
P 6 h 0.478431 0.977725 0.403790 1.125691 0.419979 1.090136
12 h 0.638197 1.054031 0.494974 1.060481 0.465404 1.003928
1 day (24 h) 0.735816 0.904779 0.591382 1.029100 0.566018 0.946414
1 week (168 h) 0.855891 0.620173 0.794512 0.719941 0.767011 0.704660
SWC 6 h 0.057037 0.054677 NaN NaN 0.147393 0.174667
12 h 0.074593 0.052939 NaN NaN 0.143384 0.148434
1 day (24 h) 0.087407 0.071916 NaN NaN 0.152106 0.165171
1 week (168 h) 0.167594 0.106338 NaN NaN 0.218510 0.166977
TS 6 h 0.060276 0.076342 NaN NaN 0.168674 0.157127
12 h 0.094433 0.138512 NaN NaN 0.177172 0.154946
1 day (24 h) 0.139021 0.150554 NaN NaN 0.190571 0.151443
1 week (168 h) 0.293335 0.190466 NaN NaN 0.254479 0.135022

Timeseries

@cache_disk(cache_dir / "the_results_ts")
def get_the_results_ts():
    # Comparison keeping the full predicted time series (time_series=True,
    # rmse=False) so the imputed values can be plotted; cached on disk.
    reset_seed()
    comp_Av = ImpComparison(models = models_var, df = hai, control = hai_era, block_len = 446, time_series=True, rmse=False)
    results_Av = comp_Av.compare(gap_len = [12,24, 336], var=list(hai.columns), n_rep=4) 
    return results_Av

results_ts = get_the_results_ts()
# split the 9 variables over two figures to keep each plot readable
ts = plot_timeseries(results_ts.query("var in ['TA', 'SW_IN', 'LW_IN', 'VPD', 'WS']"), idx_rep=0)
save_show_plot(ts, "timeseries_1", altair_render=True)

%time ts = plot_timeseries(results_ts.query("var in ['PA', 'P', 'TS', 'SWC']"), idx_rep=0)
%time save_show_plot(ts, "timeseries_2", altair_render=True)
CPU times: user 2.93 s, sys: 2.9 ms, total: 2.93 s
Wall time: 2.95 s
CPU times: user 8.44 s, sys: 74.3 ms, total: 8.52 s
Wall time: 12.3 s

from tqdm.auto import tqdm
# @cache_disk(cache_dir / "ts_plot", rm_cache=True)
def plot_additional_ts():
    """Render and save the time-series figures for every repetition beyond the first.

    Repetition 0 is already plotted above, so it is skipped here; each remaining
    repetition produces the same two figures (5 + 4 variables).
    """
    first_group = "var in ['TA', 'SW_IN', 'LW_IN', 'VPD', 'WS']"
    second_group = "var in ['PA', 'P', 'TS', 'SWC']"
    for rep in tqdm(results_ts.idx_rep.unique()):
        if rep == 0:
            continue  # skip first plot as is done above
        plot_a = plot_timeseries(results_ts.query(first_group), idx_rep=rep)
        save_show_plot(plot_a, f"timeseries_1_{rep}", altair_render=True)
        plot_b = plot_timeseries(results_ts.query(second_group), idx_rep=rep)
        save_show_plot(plot_b, f"timeseries_2_{rep}", altair_render=True)
plot_additional_ts()

Kalman Filter analysis

Gap len

@cache_disk("gap_len")
def get_g_len(n_rep=n_rep):
    # Kalman-filter-only comparison over a wide range of gap lengths
    # (1 h up to 1 week, in half-hour steps); cached on disk.
    reset_seed()
    return KalmanImpComparison(models_var, hai, hai_era, block_len=48*7+100).compare(gap_len = [2,6,12,24,48,48*2, 48*3, 48*7], var=list(hai.columns), n_rep=n_rep)
gap_len = get_g_len(n_rep)
p = plot_gap_len(gap_len, hai, hai_era)
save_show_plot(p, "gap_len")

# summary table per variable and gap length, exported to LaTeX
t = table_gap_len(gap_len)
table_gap_len_latex(t, base_path_tbl / "gap_len.tex", label="gap_len",
                caption="\\CapGapLen")
t
Gap 1 h 3 h 6 h 12 h 1 day (24 h) 2 days (48 h) 3 days (72 h) 1 week (168 h)
Variable Stand. RMSE
TA mean 0.025327 0.034172 0.055041 0.077722 0.107172 0.115631 0.119447 0.124932
std 0.022582 0.022085 0.037678 0.048111 0.059004 0.055879 0.052048 0.042619
SW_IN mean 0.141258 0.185323 0.219323 0.242353 0.283377 0.279351 0.286491 0.295492
std 0.164966 0.180712 0.192019 0.162735 0.139278 0.130424 0.125494 0.120383
LW_IN mean 0.122415 0.199663 0.251175 0.355663 0.363577 0.401382 0.399580 0.426373
std 0.159722 0.171639 0.181869 0.226522 0.188607 0.195204 0.185700 0.164282
VPD mean 0.044495 0.072448 0.106393 0.185943 0.203379 0.226881 0.238030 0.258977
std 0.050343 0.092523 0.093396 0.176841 0.135613 0.141761 0.133105 0.148695
WS mean 0.236801 0.305489 0.371760 0.473200 0.505951 0.551268 0.530790 0.564939
std 0.198199 0.193005 0.202284 0.273506 0.285058 0.238593 0.200557 0.195587
PA mean 0.026844 0.038072 0.055131 0.063885 0.068426 0.075262 0.074562 0.080174
std 0.026776 0.029629 0.065971 0.039537 0.050700 0.058926 0.055454 0.059972
P mean 0.246141 0.550198 0.443231 0.722287 0.693768 0.729318 0.850642 0.907571
std 0.764003 1.821625 0.525170 1.351510 0.782852 0.619818 0.749813 0.583107
SWC mean 0.019637 0.038882 0.053718 0.067853 0.087583 0.098537 0.113108 0.165704
std 0.016406 0.034985 0.036017 0.042192 0.062350 0.072180 0.086248 0.099118
TS mean 0.025595 0.044131 0.067155 0.105208 0.129654 0.185770 0.209769 0.315963
std 0.026281 0.042837 0.070520 0.144570 0.118676 0.147786 0.139939 0.218048
# ratio of mean standardized RMSE at the shortest vs the longest gap
g_len_agg = gap_len.groupby('gap_len').agg({'rmse_stand': 'mean'})
(g_len_agg.iloc[0])/g_len_agg.iloc[-1]
rmse_stand    0.282954
dtype: float64
# same ratio (1 h vs 168 h gap) computed per variable
g_len_agg = gap_len.groupby(['gap_len', 'var']).agg({'rmse_stand': 'mean'})
(g_len_agg.loc[1.])/g_len_agg.loc[168.]
rmse_stand
var
TA 0.202725
SW_IN 0.478043
LW_IN 0.287107
VPD 0.171809
WS 0.419162
PA 0.334820
P 0.271208
SWC 0.118507
TS 0.081006
g_len_agg
rmse_stand
gap_len var
1 TA 0.025327
SW_IN 0.141258
LW_IN 0.122415
VPD 0.044495
WS 0.236801
... ... ...
168 WS 0.564939
PA 0.080174
P 0.907571
SWC 0.165704
TS 0.315963

72 rows × 1 columns

# same shortest-vs-longest ratio but for the std of the standardized RMSE
g_len_agg_std = gap_len.groupby('gap_len').agg({'rmse_stand': 'std'})
(g_len_agg_std.iloc[0])/g_len_agg_std.iloc[-1]
rmse_stand    0.849176
dtype: float64
# quick diagnostic: std of stand. RMSE vs gap length, one subplot per variable
(gap_len.groupby(['gap_len', 'var']).agg({'rmse_stand': 'std'})
    .unstack("var")
    .droplevel(0, 1) 
    .plot(subplots=True, layout=(3,3), figsize=(10,10)))
array([[<AxesSubplot: xlabel='gap_len'>, <AxesSubplot: xlabel='gap_len'>,
        <AxesSubplot: xlabel='gap_len'>],
       [<AxesSubplot: xlabel='gap_len'>, <AxesSubplot: xlabel='gap_len'>,
        <AxesSubplot: xlabel='gap_len'>],
       [<AxesSubplot: xlabel='gap_len'>, <AxesSubplot: xlabel='gap_len'>,
        <AxesSubplot: xlabel='gap_len'>]], dtype=object)

# with open(base_path_tbl / "gap_len.tex") as f:
    # print(f.readlines())

Control

# Two otherwise comparable models: one trained without the control input, one with it.
models_nc = pd.DataFrame({'model': [ l_model("1_gap_varying_336_no_control_v1.pickle"), l_model("1_gap_varying_6-336_v3.pickle")],
                          'type':   [ 'No Control',                                       'Use Control'                         ]})                                        
@cache_disk("use_control")
def get_control(n_rep=n_rep):
    # compare the two models on gaps of 12/24/48 steps and 1 week; cached on disk
    reset_seed()
    
    kcomp_control = KalmanImpComparison(models_nc, hai, hai_era, block_len=100+48*7)

    k_results_control = kcomp_control.compare(n_rep =n_rep, gap_len = [12, 24, 48, 48*7], var = list(hai.columns))
    
    return k_results_control
from time import sleep  # NOTE(review): `sleep` appears unused here — confirm before removing
k_results_control = get_control(n_rep)
k_results_control
var loss likelihood gap_len idx_rep type rmse rmse_stand gap_len_f
0 TA -5.742477 2.142377 6 0 No Control 0.227680 0.028731 6 h
1 TA -5.755538 2.148783 6 1 No Control 0.228037 0.028776 6 h
2 TA -1.232816 2.132641 6 2 No Control 1.813357 0.228826 6 h
3 TA -4.693027 2.118098 6 3 No Control 0.906512 0.114392 6 h
4 TA -5.179578 2.151546 6 4 No Control 0.726325 0.091654 6 h
... ... ... ... ... ... ... ... ... ...
995 TS 115.193441 0.966260 168 495 Use Control 1.770984 0.312970 1 week (168 h)
996 TS 82.913345 1.474181 168 496 Use Control 1.403469 0.248022 1 week (168 h)
997 TS 93.694508 1.154413 168 497 Use Control 1.526679 0.269796 1 week (168 h)
998 TS 118.712986 1.494637 168 498 Use Control 1.798757 0.317878 1 week (168 h)
999 TS 93.694508 1.154413 168 499 Use Control 1.526679 0.269796 1 week (168 h)

36000 rows × 9 columns

# figure and LaTeX table comparing "Use Control" vs "No Control"
p = plot_compare(k_results_control, 'type', y = 'rmse', scale_domain=["Use Control", "No Control"])
save_show_plot(p, "use_control")
p
from functools import partial  # NOTE(review): duplicate of the import at the top of the notebook
t = table_compare(k_results_control, 'type')
table_compare_latex(t, base_path_tbl / "control.tex", label="tbl:control",
                caption="\\CapControl")
t
type Use Control No Control
Stand. RMSE mean std se mean std se diff.
Variable Gap
TA 6 h 0.092151 0.055814 0.002496 0.094594 0.059489 0.002660 0.002443
12 h 0.118358 0.074072 0.003313 0.160712 0.106870 0.004779 0.042354
1 day (24 h) 0.146578 0.076799 0.003435 0.232744 0.159137 0.007117 0.086166
1 week (168 h) 0.174324 0.064495 0.002884 0.294072 0.147849 0.006612 0.119748
SW_IN 6 h 0.255795 0.164694 0.007365 0.331166 0.270628 0.012103 0.075371
12 h 0.314755 0.164839 0.007372 0.518890 0.319771 0.014301 0.204135
1 day (24 h) 0.339310 0.128516 0.005747 0.608981 0.277150 0.012395 0.269671
1 week (168 h) 0.357922 0.103167 0.004614 0.693166 0.248412 0.011109 0.335244
LW_IN 6 h 0.299442 0.201432 0.009008 0.330486 0.185322 0.008288 0.031044
12 h 0.367537 0.197000 0.008810 0.485509 0.193452 0.008651 0.117972
1 day (24 h) 0.400412 0.175611 0.007854 0.561446 0.186706 0.008350 0.161035
1 week (168 h) 0.466036 0.141424 0.006325 0.666359 0.161485 0.007222 0.200323
VPD 6 h 0.137863 0.104531 0.004675 0.178626 0.139476 0.006238 0.040762
12 h 0.226616 0.183575 0.008210 0.320797 0.262730 0.011750 0.094181
1 day (24 h) 0.272840 0.206783 0.009248 0.407586 0.296989 0.013282 0.134746
1 week (168 h) 0.325113 0.154287 0.006900 0.499408 0.252086 0.011274 0.174294
WS 6 h 0.571539 0.523304 0.023403 0.429625 0.262009 0.011717 -0.141914
12 h 0.660302 0.408751 0.018280 0.680223 0.391765 0.017520 0.019921
1 day (24 h) 0.676472 0.374177 0.016734 0.767313 0.430367 0.019247 0.090841
1 week (168 h) 0.781636 0.264823 0.011843 0.888977 0.318094 0.014226 0.107340
PA 6 h 0.099325 0.066109 0.002956 0.144519 0.105120 0.004701 0.045194
12 h 0.116360 0.061026 0.002729 0.338753 0.209793 0.009382 0.222393
1 day (24 h) 0.136620 0.076087 0.003403 0.562996 0.378688 0.016935 0.426376
1 week (168 h) 0.158988 0.068315 0.003055 0.879748 0.439831 0.019670 0.720760
P 6 h 0.542198 0.990843 0.044312 0.507613 0.973715 0.043546 -0.034584
12 h 0.627167 1.034936 0.046284 0.605429 1.025064 0.045842 -0.021738
1 day (24 h) 0.628179 0.575428 0.025734 0.614481 0.583966 0.026116 -0.013698
1 week (168 h) 0.880507 0.572737 0.025614 0.860985 0.597007 0.026699 -0.019522
SWC 6 h 0.152144 0.107350 0.004801 0.151053 0.141370 0.006322 -0.001092
12 h 0.244768 0.154341 0.006902 0.305445 0.233699 0.010451 0.060678
1 day (24 h) 0.392553 0.222757 0.009962 0.538003 0.337738 0.015104 0.145450
1 week (168 h) 0.719138 0.314318 0.014057 0.842404 0.431371 0.019292 0.123267
TS 6 h 0.168111 0.114636 0.005127 0.118928 0.076226 0.003409 -0.049183
12 h 0.226302 0.134730 0.006025 0.203284 0.132946 0.005946 -0.023019
1 day (24 h) 0.286258 0.152198 0.006807 0.285367 0.174358 0.007798 -0.000891
1 week (168 h) 0.362279 0.176402 0.007889 0.359044 0.191313 0.008556 -0.003235

Gap in Other variables

# Same trained model evaluated in two gap scenarios: gaps in all variables at
# once vs a gap only in the target variable (gap_single_var flag).
models_gap_single = pd.DataFrame.from_records([
    {'Gap':  'All variables', 'gap_single_var': False, 'model': l_model("all_varying_gap_varying_len_6-30_v3.pickle")},
    {'Gap':  'Only one var',  'gap_single_var': True,  'model': l_model("all_varying_gap_varying_len_6-30_v3.pickle")},
])
@cache_disk("gap_single")
def get_gap_single(n_rep):
    # NOTE(review): no reset_seed() here, unlike the other cached comparisons — confirm intended
    kcomp_single = KalmanImpComparison(models_gap_single, hai, hai_era, block_len=130)

    return kcomp_single.compare(n_rep =n_rep, gap_len = [6, 12, 24, 30], var = list(hai.columns))
res_single = get_gap_single(n_rep)
p = plot_compare(res_single, "Gap", y = 'rmse', scale_domain=["Only one var", "All variables"])
save_show_plot(p, "gap_single_var")

# LaTeX table for the same comparison
t = table_compare(res_single, 'Gap')
table_compare_latex(t, base_path_tbl / "gap_single_var.tex", caption="\\CapGapSingle", label="tbl:gap_single_var")
t
Gap Only one var All variables
Stand. RMSE mean std se mean std se diff.
Variable Gap
TA 3 h 0.029116 0.026433 0.001182 0.048143 0.040335 0.001804 0.019027
6 h 0.042861 0.031811 0.001423 0.080906 0.062887 0.002812 0.038045
12 h 0.071487 0.049631 0.002220 0.124425 0.096532 0.004317 0.052939
15 h 0.079382 0.053998 0.002415 0.135941 0.087736 0.003924 0.056559
SW_IN 3 h 0.198721 0.204619 0.009151 0.196330 0.239872 0.010727 -0.002390
6 h 0.230639 0.190542 0.008521 0.247159 0.234288 0.010478 0.016520
12 h 0.268135 0.173156 0.007744 0.279729 0.220169 0.009846 0.011595
15 h 0.254110 0.137783 0.006162 0.275088 0.184996 0.008273 0.020978
LW_IN 3 h 0.198619 0.171501 0.007670 0.204255 0.192592 0.008613 0.005636
6 h 0.288557 0.205514 0.009191 0.285575 0.220568 0.009864 -0.002982
12 h 0.336802 0.211468 0.009457 0.346164 0.234498 0.010487 0.009362
15 h 0.339017 0.183203 0.008193 0.343338 0.211743 0.009469 0.004321
VPD 3 h 0.063849 0.064851 0.002900 0.096974 0.101109 0.004522 0.033125
6 h 0.096670 0.092415 0.004133 0.154050 0.152273 0.006810 0.057380
12 h 0.163176 0.134966 0.006036 0.235416 0.209535 0.009371 0.072241
15 h 0.170890 0.147536 0.006598 0.257427 0.230171 0.010294 0.086537
WS 3 h 0.299512 0.185594 0.008300 0.300374 0.185026 0.008275 0.000862
6 h 0.394661 0.234428 0.010484 0.396053 0.237782 0.010634 0.001392
12 h 0.479464 0.249523 0.011159 0.499147 0.264118 0.011812 0.019682
15 h 0.470032 0.218304 0.009763 0.481117 0.234884 0.010504 0.011084
PA 3 h 0.024250 0.015962 0.000714 0.028151 0.017252 0.000772 0.003902
6 h 0.036931 0.024932 0.001115 0.042390 0.031342 0.001402 0.005459
12 h 0.050980 0.025147 0.001125 0.055303 0.026073 0.001166 0.004323
15 h 0.054699 0.037828 0.001692 0.057890 0.042331 0.001893 0.003191
P 3 h 0.338024 0.665541 0.029764 0.322645 0.525352 0.023494 -0.015379
6 h 0.380737 0.804932 0.035998 0.405015 0.923454 0.041298 0.024277
12 h 0.428597 0.572176 0.025588 0.455575 0.680281 0.030423 0.026978
15 h 0.484958 1.048699 0.046899 0.507848 1.212676 0.054232 0.022889
SWC 3 h 0.012567 0.019060 0.000852 0.013736 0.023656 0.001058 0.001169
6 h 0.017016 0.026366 0.001179 0.021119 0.033282 0.001488 0.004103
12 h 0.029611 0.049173 0.002199 0.033421 0.048640 0.002175 0.003810
15 h 0.031970 0.045392 0.002030 0.039874 0.055206 0.002469 0.007904
TS 3 h 0.028479 0.024754 0.001107 0.032860 0.028891 0.001292 0.004381
6 h 0.045677 0.047974 0.002145 0.061144 0.057966 0.002592 0.015467
12 h 0.088208 0.091400 0.004088 0.114275 0.118003 0.005277 0.026068
15 h 0.101862 0.103646 0.004635 0.131989 0.130062 0.005817 0.030128
# % improvement of "Only one var" over "All variables", per variable and gap length
res_singl_perc = res_single.groupby(['Gap', 'var', 'gap_len']).agg({'rmse_stand': 'mean'}).reset_index().pivot(columns = 'Gap', values='rmse_stand', index=['var', 'gap_len'])
pd.DataFrame({'Only one var': (res_singl_perc["All variables"] - res_singl_perc["Only one var"]) / res_singl_perc["All variables"] * 100})
Only one var
var gap_len
TA 3 39.521284
6 47.023566
12 42.546605
15 41.605866
SW_IN 3 -1.217559
6 6.684132
12 4.144939
15 7.625968
LW_IN 3 2.759433
6 -1.044157
12 2.704600
15 1.258656
VPD 3 34.158966
6 37.247869
12 30.686333
15 33.616012
WS 3 0.286928
6 0.351345
12 3.943177
15 2.303891
PA 3 13.860326
6 12.877279
12 7.816283
15 5.511460
P 3 -4.766508
6 5.994219
12 5.921732
15 4.507125
SWC 3 8.507126
6 19.428107
12 11.400826
15 19.823475
TS 3 13.331333
6 25.295932
12 22.811204
15 22.825776
# same improvement percentage aggregated over gap lengths (per variable only)
res_singl_perc = res_single.groupby(['Gap', 'var']).agg({'rmse_stand': 'mean'}).reset_index().pivot(columns = 'Gap', values='rmse_stand', index=['var'])
pd.DataFrame({'Only one var': (res_singl_perc["All variables"] - res_singl_perc["Only one var"]) / res_singl_perc["All variables"] * 100})
Only one var
var
TA 42.774328
SW_IN 4.678197
LW_IN 1.385379
VPD 33.511756
WS 1.969357
PA 9.183783
P 3.475042
SWC 15.706237
TS 22.347878

Generic vs Specialized

# Generic model: the same multi-variable model reused for every variable
models_generic = models_var.copy()
models_generic.model = l_model("1_gap_varying_6-336_v3.pickle") 
models_generic['type'] = 'Generic'
models_generic
var model type
0 TA Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 Generic
1 SW_IN Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 Generic
2 LW_IN Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 Generic
3 VPD Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 Generic
4 WS Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 Generic
5 PA Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 Generic
6 P Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 Generic
7 TS Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 Generic
8 SWC Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 Generic
# stack generic and per-variable (fine-tuned) models into one comparison frame
models_var['type'] = 'Fine-tuned one var'
models_gen_vs_spec = pd.concat([models_generic, models_var])
models_gen_vs_spec
var model type
0 TA Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 Generic
1 SW_IN Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 Generic
2 LW_IN Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 Generic
3 VPD Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 Generic
4 WS Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 Generic
5 PA Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 Generic
6 P Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 Generic
7 TS Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 Generic
8 SWC Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 Generic
0 TA Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 Fine-tuned one var
1 SW_IN Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 Fine-tuned one var
2 LW_IN Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 Fine-tuned one var
3 VPD Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 Fine-tuned one var
4 WS Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 Fine-tuned one var
5 PA Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 Fine-tuned one var
6 P Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 Fine-tuned one var
7 TS Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 Fine-tuned one var
8 SWC Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14 Fine-tuned one var
@cache_disk("generic")
def get_generic(n_rep=n_rep):
    # Generic vs fine-tuned models on gaps of 12/24/48 steps and 1 week; cached on disk.
    reset_seed()

    comp_generic = KalmanImpComparison(models_gen_vs_spec, hai, hai_era, block_len=100+48*7)

    return comp_generic.compare(n_rep =n_rep, gap_len = [12, 24, 48, 48*7], var = list(hai.columns))
k_results_generic = get_generic(n_rep)
plot_formatter.legend_symbol_size = 300  # bigger legend markers for this figure
p = plot_compare(k_results_generic, 'type', y = 'rmse', scale_domain=["Fine-tuned one var", "Generic"])
save_show_plot(p, "generic")
p
# LaTeX table for the same comparison
t = table_compare(k_results_generic, 'type')
table_compare_latex(t, base_path_tbl / "generic.tex", label='tbl:generic', caption="\\CapGeneric")
t
type Generic Fine-tuned one var
Stand. RMSE mean std se mean std se diff.
Variable Gap
TA 6 h 0.092151 0.055814 0.002496 0.054999 0.032663 0.001461 -0.037153
12 h 0.118358 0.074072 0.003313 0.077345 0.052042 0.002327 -0.041013
1 day (24 h) 0.146578 0.076799 0.003435 0.100970 0.058986 0.002638 -0.045608
1 week (168 h) 0.174324 0.064495 0.002884 0.128855 0.048376 0.002163 -0.045469
SW_IN 6 h 0.255795 0.164694 0.007365 0.208728 0.167255 0.007480 -0.047067
12 h 0.314755 0.164839 0.007372 0.257761 0.163914 0.007330 -0.056994
1 day (24 h) 0.339310 0.128516 0.005747 0.280578 0.131607 0.005886 -0.058733
1 week (168 h) 0.357922 0.103167 0.004614 0.297530 0.118822 0.005314 -0.060391
LW_IN 6 h 0.299442 0.201432 0.009008 0.256844 0.199753 0.008933 -0.042598
12 h 0.367537 0.197000 0.008810 0.326104 0.209362 0.009363 -0.041433
1 day (24 h) 0.400412 0.175611 0.007854 0.358755 0.185939 0.008315 -0.041657
1 week (168 h) 0.466036 0.141424 0.006325 0.405833 0.165308 0.007393 -0.060203
VPD 6 h 0.137863 0.104531 0.004675 0.096682 0.089465 0.004001 -0.041182
12 h 0.226616 0.183575 0.008210 0.169084 0.164870 0.007373 -0.057532
1 day (24 h) 0.272840 0.206783 0.009248 0.202490 0.159921 0.007152 -0.070350
1 week (168 h) 0.325113 0.154287 0.006900 0.263913 0.153488 0.006864 -0.061201
WS 6 h 0.571539 0.523304 0.023403 0.365262 0.200693 0.008975 -0.206276
12 h 0.660302 0.408751 0.018280 0.481919 0.265006 0.011851 -0.178383
1 day (24 h) 0.676472 0.374177 0.016734 0.510818 0.270480 0.012096 -0.165653
1 week (168 h) 0.781636 0.264823 0.011843 0.569137 0.189984 0.008496 -0.212499
PA 6 h 0.099325 0.066109 0.002956 0.054017 0.032223 0.001441 -0.045308
12 h 0.116360 0.061026 0.002729 0.059966 0.030824 0.001378 -0.056394
1 day (24 h) 0.136620 0.076087 0.003403 0.072074 0.068788 0.003076 -0.064546
1 week (168 h) 0.158988 0.068315 0.003055 0.081116 0.053864 0.002409 -0.077872
P 6 h 0.542198 0.990843 0.044312 0.542198 0.990843 0.044312 0.000000
12 h 0.627167 1.034936 0.046284 0.627167 1.034936 0.046284 0.000000
1 day (24 h) 0.628179 0.575428 0.025734 0.628179 0.575428 0.025734 0.000000
1 week (168 h) 0.880507 0.572737 0.025614 0.880507 0.572737 0.025614 0.000000
SWC 6 h 0.152144 0.107350 0.004801 0.050597 0.033592 0.001502 -0.101547
12 h 0.244768 0.154341 0.006902 0.067366 0.043308 0.001937 -0.177402
1 day (24 h) 0.392553 0.222757 0.009962 0.078784 0.053435 0.002390 -0.313769
1 week (168 h) 0.719138 0.314318 0.014057 0.169313 0.103684 0.004637 -0.549825
TS 6 h 0.168111 0.114636 0.005127 0.065352 0.068001 0.003041 -0.102759
12 h 0.226302 0.134730 0.006025 0.088370 0.087491 0.003913 -0.137932
1 day (24 h) 0.286258 0.152198 0.006807 0.129457 0.127674 0.005710 -0.156801
1 week (168 h) 0.362279 0.176402 0.007889 0.316745 0.217719 0.009737 -0.045534
# Per-variable percentage improvement of the fine-tuned model over the generic one.
res_singl_perc = (
    k_results_generic
    .groupby(['type', 'var'])['rmse_stand']
    .mean()
    .unstack('type')  # rows: var, columns: model type
)
100 * (res_singl_perc["Generic"] - res_singl_perc["Fine-tuned one var"]) / res_singl_perc["Generic"]
var
TA       31.847749
SW_IN    17.604408
LW_IN    12.122588
VPD      23.925198
WS       28.357855
PA       47.745394
P         0.000000
SWC      75.735168
TS       42.478176
dtype: float64

Training

# Models trained with different strategies; the pickle file name encodes the setup.
_train_specs = [
    # ('All variables',  "All_gap_all_30_v1.pickle"),  # not used in the thesis
    ('Only one var',   "1_gap_varying_6-336_v3.pickle"),
    ('Multi vars',     "all_varying_gap_varying_len_6-30_v3.pickle"),
    ('Random params',  "rand_all_varying_gap_varying_len_6-30_v4.pickle"),
]
models_train = pd.DataFrame.from_records(
    [{'Train': name, 'model': l_model(file)} for name, file in _train_specs]
)
models_train
Train model
0 Only one var Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14
1 Multi vars Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14
2 Random params Kalman Filter\n N dim obs: 9,\n N dim state: 18,\n N dim contr: 14
@cache_disk("train")
def get_train(n_rep):
    "Compare the differently-trained models on short gaps (cached on disk)."
    reset_seed()
    comparison = KalmanImpComparison(models_train, hai, hai_era, block_len=130)
    return comparison.compare(n_rep=n_rep, gap_len=[6, 12, 24, 30], var=list(hai.columns))
res_train = get_train(n_rep)
# Mean standardised RMSE per training strategy and gap length.
res_train_agg = (res_train
                 .groupby(['Train', 'gap_len'])
                 .agg({'rmse_stand': 'mean'})
                 .reset_index())
res_train_agg
Train gap_len rmse_stand
0 Multi vars 3 0.131760
1 Multi vars 6 0.166202
2 Multi vars 12 0.215978
3 Multi vars 15 0.225579
4 Only one var 3 0.192683
5 Only one var 6 0.250933
6 Only one var 12 0.332169
7 Only one var 15 0.340622
8 Random params 3 0.200750
9 Random params 6 0.236855
10 Random params 12 0.292697
11 Random params 15 0.294936
# Plot and tabulate the training-strategy comparison for the manuscript.
plot_train = plot_compare(res_train, "Train", y='rmse',
                          scale_domain=["Multi vars", "Only one var", "Random params"])
save_show_plot(plot_train, "train_compare")

tbl_train = table_compare3(res_train, 'Train')
table_compare3_latex(tbl_train, base_path_tbl / "train.tex", label="tbl:train_compare", caption="\\CapTrain")
tbl_train
Train Multi vars Only one var Random params
Stand. RMSE mean std mean std mean std
Variable Gap [$h$]
TA 3 h 0.028752 0.023775 0.066495 0.049728 0.060459 0.045142
6 h 0.044355 0.032665 0.092593 0.062233 0.079903 0.055433
12 h 0.072632 0.049516 0.133494 0.091525 0.117427 0.085482
15 h 0.077183 0.051459 0.130566 0.071501 0.111204 0.067554
SW_IN 3 h 0.178118 0.175674 0.201927 0.178423 0.259459 0.219808
6 h 0.210377 0.170297 0.249492 0.175702 0.282919 0.209136
12 h 0.268875 0.172214 0.307884 0.167333 0.333024 0.192553
15 h 0.269610 0.165685 0.315006 0.164416 0.333722 0.187786
LW_IN 3 h 0.200445 0.169263 0.215081 0.177105 0.298262 0.202726
6 h 0.269851 0.215872 0.296780 0.229277 0.341691 0.228729
12 h 0.331508 0.224076 0.376745 0.241035 0.398407 0.259110
15 h 0.356784 0.208411 0.389550 0.207828 0.398491 0.224671
VPD 3 h 0.064145 0.060110 0.097167 0.079462 0.150077 0.127847
6 h 0.098231 0.096103 0.147885 0.112805 0.193383 0.174094
12 h 0.168481 0.155672 0.222713 0.160915 0.240388 0.192226
15 h 0.185879 0.158725 0.241190 0.167722 0.254005 0.210471
WS 3 h 0.305451 0.182717 0.471322 0.459590 0.447337 0.299554
6 h 0.371935 0.210461 0.575585 0.465884 0.481752 0.308642
12 h 0.429368 0.195014 0.649132 0.481338 0.523181 0.269521
15 h 0.478824 0.230152 0.690806 0.456658 0.540468 0.292931
PA 3 h 0.024824 0.018937 0.073198 0.061610 0.067156 0.056764
6 h 0.035399 0.020712 0.094714 0.061088 0.097657 0.076961
12 h 0.051464 0.045255 0.117277 0.073159 0.118236 0.089099
15 h 0.058828 0.039719 0.127520 0.079590 0.114797 0.076361
P 3 h 0.348034 0.680361 0.411050 1.509773 0.370699 0.713138
6 h 0.399553 0.617684 0.486592 0.718194 0.425397 0.704196
12 h 0.511262 0.851207 0.706403 1.150406 0.535496 0.980554
15 h 0.453258 0.597555 0.618186 0.755660 0.469612 0.698131
SWC 3 h 0.012302 0.029214 0.095440 0.075541 0.106073 0.074970
6 h 0.018203 0.030712 0.150918 0.108401 0.153282 0.106928
12 h 0.027391 0.036781 0.245083 0.160379 0.246170 0.215060
15 h 0.033106 0.041433 0.282246 0.227894 0.278409 0.208012
TS 3 h 0.023765 0.017708 0.102470 0.082133 0.047228 0.037464
6 h 0.047910 0.048379 0.163837 0.105534 0.075711 0.073902
12 h 0.082818 0.086185 0.230788 0.157936 0.121944 0.134360
15 h 0.116737 0.128579 0.270524 0.190755 0.153716 0.160674

Extra results

Standard deviations

# Standard deviation of each Hainich variable with its measurement unit
# (used to convert standardised RMSE back to physical units).
hai_std = hai.std().to_frame(name='std')
hai_std.index.name = "Variable"
# BUG FIX: the original zipped `units_big.values()` positionally, which
# mismatched units and variables in the printed table (e.g. LW_IN got hPa,
# P got W m-2); look each unit up by variable name instead.
# NOTE(review): assumes `units_big` is keyed by variable name — confirm.
hai_std = hai_std.reset_index().assign(
    unit=[f"\\si{{{units_big[var]}}}" for var in hai_std.index]
)
hai_std
Variable std unit
0 TA 7.924611 \si{°C}
1 SW_IN 204.002561 \si{W m-2}
2 LW_IN 41.955741 \si{hPa}
3 VPD 4.368417 \si{hPa}
4 WS 1.625425 \si{mm}
5 PA 0.855159 \si{m s-1}
6 P 0.280276 \si{W m-2}
7 SWC 8.913119 \si{%}
8 TS 5.658643 \si{°C}
# Export the standard-deviation table to LaTeX for the manuscript.
latex = hai_std.style.hide(axis="index").format(precision=3).to_latex(hrules=True, caption="\\CapStd", label="tbl:hai_std", position_float="centering")

# `base_path_tbl` is a pathlib.Path: write_text replaces the manual open/write pair.
(base_path_tbl / "hai_std.tex").write_text(latex)

Gap distribution

# Per-site metadata (measurement period and site code) for all FLUXNET sites.
out_dir = here("../fluxnet/gap_stat")
site_info = pl.read_parquet(out_dir / "../site_info.parquet").select([
    # timestamps are stored as integers like 199701010030 (YYYYMMDDHHMM)
    pl.col("start").cast(pl.Utf8).str.strptime(pl.Datetime, "%Y%m%d%H%M"),
    pl.col("end").cast(pl.Utf8).str.strptime(pl.Datetime, "%Y%m%d%H%M"),
    # NOTE(review): `.sort()` here sorts the `site` column independently of
    # start/end — rows stay aligned only if the parquet is already sorted by
    # site; verify against the file, since the loop below matches by position.
    pl.col("site").cast(pl.Categorical).sort()
])
def duration_n_obs(duration):
    "Number of half-hourly FLUXNET observations spanned by `duration` (sign ignored)."
    half_hours = duration.total_seconds() / (30 * 60)  # FLUXNET time step is 30 min
    return abs(int(half_hours))
# One parquet file per site with its gap statistics; files and `site_info`
# rows are matched purely by sorted position.
# NOTE(review): `.ls()` on a Path presumably comes from fastcore's Path
# patching (pulled in via the fastai import) — confirm.
files = out_dir.ls()
files.sort() # need to sort to match the site_info
sites = []
for i, path in enumerate(files):
    sites.append(pl.scan_parquet(path).with_columns([
        pl.lit(site_info[i, "site"]).alias("site"),
        # start - end is negative, but duration_n_obs takes the absolute value
        pl.lit(duration_n_obs(site_info[i, "start"] -  site_info[i, "end"])).alias("total_obs"),
        pl.col("TIMESTAMP_END").cast(pl.Utf8).str.strptime(pl.Datetime, "%Y%m%d%H%M").alias("end"),
    ]).drop("TIMESTAMP_END"))

# lazy concatenation over all sites; collected on demand below
gap_stat = pl.concat(sites)
pl.read_parquet(files[0])
shape: (31574, 3) TIMESTAMP_END gap_len variable i64 u32 str 200901010030 16992 "TA_F_MDS_QC" 200912211100 5 "TA_F_MDS_QC" 200912211700 1 "TA_F_MDS_QC" 201001061300 1 "TA_F_MDS_QC" 201001071300 3 "TA_F_MDS_QC" 201001081130 2 "TA_F_MDS_QC" 201001081830 1 "TA_F_MDS_QC" 201001091000 2 "TA_F_MDS_QC" 201001191000 1 "TA_F_MDS_QC" 201001191130 2 "TA_F_MDS_QC" 201001291900 1 "TA_F_MDS_QC" 201002131030 2 "TA_F_MDS_QC" ... ... ... 201103241630 3 "NEE_VUT_95_QC" 201103241930 29 "NEE_VUT_95_QC" 201103251900 8 "NEE_VUT_95_QC" 201103260030 1 "NEE_VUT_95_QC" 201103260200 1 "NEE_VUT_95_QC" 201103260300 1 "NEE_VUT_95_QC" 201103261230 1 "NEE_VUT_95_QC" 201103261330 2 "NEE_VUT_95_QC" 201103261630 1 "NEE_VUT_95_QC" 201103261930 2 "NEE_VUT_95_QC" 201103262100 1 "NEE_VUT_95_QC" 201103270000 13441 "NEE_VUT_95_QC"
gap_stat.head().collect()  # sanity check: site, total_obs and parsed `end` are joined in
shape: (5, 5) gap_len variable site total_obs end u32 str str i32 datetime[μs] 16992 "TA_F_MDS_QC" "AR-SLu" 52559 2009-01-01 00:30:00 5 "TA_F_MDS_QC" "AR-SLu" 52559 2009-12-21 11:00:00 1 "TA_F_MDS_QC" "AR-SLu" 52559 2009-12-21 17:00:00 1 "TA_F_MDS_QC" "AR-SLu" 52559 2010-01-06 13:00:00 3 "TA_F_MDS_QC" "AR-SLu" 52559 2010-01-07 13:00:00
def plot_var_dist(var, small=False, ax=None):
    """Histogram of gap lengths for one QC-flag variable across all sites.

    First draft — superseded by the redefinition further down in the notebook,
    which takes the short variable name and a per-variable colour.

    var: raw QC column name, e.g. 'TA_F_QC'.
    small: restrict to gaps shorter than 200 observations (half-hour steps).
    ax: matplotlib axis to draw on; a fresh single-axis grid if None.
    """
    if ax is None: ax = get_grid(1)[0]
    ta_gaps = gap_stat.filter(
        (pl.col("variable") == var)
    ).filter(
        # no-op filter when small is False
        pl.col("gap_len") < 200 if small else True
    ).with_column(pl.col("gap_len") / (24 *2 * 7)).collect().to_pandas().hist("gap_len", bins=50, ax=ax)  # 24*2*7 half-hours = 1 week
    ax.set_title(f"{var} - { 'gaps < 200' if small else 'all gaps'}")
    if not small: ax.set_yscale('log')
    ax.set_xlabel("gap length (weeks)")
    ax.set_ylabel(f"{'Log' if not small else ''} n gaps")
    # plt.xscale('log') 
plot_var_dist('TA_F_QC')

# One fixed colour per meteorological variable, shared across all plots.
_palette = sns.color_palette('Set2', n_colors=len(hai.columns)).as_hex()
color_map = {var: col for var, col in zip(scale_meteo.domain, list(_palette))}
# FLUXNET quality-flag column corresponding to each short variable name
# (soil temperature and soil water content use the MDS-gapfilled flags).
qc_map = {
    'TA': 'TA_F_QC',
    'SW_IN': 'SW_IN_F_QC',
    'LW_IN': 'LW_IN_F_QC',
    'VPD': 'VPD_F_QC',
    'WS': 'WS_F_QC',
    'PA': 'PA_F_QC',
    'P': 'P_F_QC',
    'TS': 'TS_F_MDS_1_QC',
    'SWC': 'SWC_F_MDS_1_QC',
}
def pl_in(col, values):
    """Polars expression: True where column `col` equals any element of `values`.

    Replaces the original hand-rolled chain of `(col == v) | ...` with the
    built-in `Expr.is_in`, which performs the membership test in one pass.
    `values` may be any iterable (e.g. dict_values), hence the list() call.
    """
    return pl.col(col).is_in(list(values))
# Share of gaps shorter than one week, by number of gaps and by total length.
# NOTE(review): `pl.count()` inside with_columns is the full-column count, and
# the per-group divisions below produce list columns (visible in the printed
# output) with the same value repeated — presumably only that scalar is of
# interest. Verify this is the intended aggregation.
gap_stat.filter(pl_in('variable', qc_map.values())
               ).with_columns([
    pl.when(pl.col("gap_len") < 48*7).then(True).otherwise(False).alias("short"),
    pl.count().alias("total"),
    pl.count().alias("total len"),
]).groupby("short").agg([
    (pl.col("gap_len").count() / pl.col("total")).alias("frac_num"),
    (pl.col("gap_len").sum() / pl.col("total len")).alias("frac_len")
]).collect()
shape: (2, 3) short frac_num frac_len bool list[f64] list[f64] false [0.010775, 0.010775, ... 0.010775] [63.844386, 63.844386, ... 63.844386] true [0.989225, 0.989225, ... 0.989225] [7.261655, 7.261655, ... 7.261655]
# NOTE(review): exact duplicate of the previous cell (kept as-is from the notebook).
gap_stat.filter(pl_in('variable', qc_map.values())
               ).with_columns([
    pl.when(pl.col("gap_len") < 48*7).then(True).otherwise(False).alias("short"),
    pl.count().alias("total"),
    pl.count().alias("total len"),
]).groupby("short").agg([
    (pl.col("gap_len").count() / pl.col("total")).alias("frac_num"),
    (pl.col("gap_len").sum() / pl.col("total len")).alias("frac_len")
]).collect()
shape: (2, 3) short frac_num frac_len bool list[f64] list[f64] false [0.010775, 0.010775, ... 0.010775] [63.844386, 63.844386, ... 63.844386] true [0.989225, 0.989225, ... 0.989225] [7.261655, 7.261655, ... 7.261655]
# Per (site, variable): mean gap length and fraction of the record that is gap.
# NOTE(review): frac_gap exceeds 1 for some sites in the printed output —
# presumably gaps recorded outside the site_info measurement window inflate
# the sum relative to total_obs; verify before quoting these numbers.
frac_miss = gap_stat.filter(
    pl_in('variable', qc_map.values())
).groupby(["site", "variable"]).agg([
    pl.col("gap_len").mean().alias("mean"),
    (pl.col("gap_len").sum() / pl.col("total_obs").first()).alias("frac_gap")
])
# Summary of the gap fraction across sites, per variable.
frac_miss.groupby('variable').agg([
    pl.col("frac_gap").max().alias("max"),
    pl.col("frac_gap").min().alias("min"),
    pl.col("frac_gap").std().alias("std"),
    pl.col("frac_gap").mean().alias("mean"),
]).collect()
shape: (9, 5) variable max min std mean str f64 f64 f64 f64 "SW_IN_F_QC" 2.193984 0.000038 0.21952 0.16772 "PA_F_QC" 2.675381 0.000011 0.420542 0.387658 "VPD_F_QC" 2.298799 0.000011 0.297668 0.245457 "SWC_F_MDS_1_QC... 1.487143 0.000011 0.29985 0.314847 "TS_F_MDS_1_QC" 3.090642 0.000011 0.38542 0.282106 "WS_F_QC" 2.675324 0.000798 0.323944 0.257303 "TA_F_QC" 2.299027 0.000011 0.256043 0.198846 "P_F_QC" 1.503296 0.000011 0.330422 0.311598 "LW_IN_F_QC" 7.074603 0.000019 0.704469 0.578246
frac_miss.sort("frac_gap", reverse=True).collect()  # worst (site, variable) pairs first
shape: (1798, 4) site variable mean frac_gap str str f64 f64 "US-LWW" "LW_IN_F_QC" 11267.590909 7.074603 "US-Wi5" "LW_IN_F_QC" 70128.0 3.992031 "ES-LgS" "LW_IN_F_QC" 175344.0 3.333093 "US-LWW" "TS_F_MDS_1_QC" 1320.646341 3.090642 "US-Wi5" "TS_F_MDS_1_QC" 62.142857 2.897194 "US-Wi0" "LW_IN_F_QC" 1369.694444 2.814601 "US-ORv" "PA_F_QC" 3.899334 2.675381 "US-ORv" "WS_F_QC" 3.899251 2.675324 "US-LWW" "PA_F_QC" 409.701754 2.665944 "US-Wi5" "WS_F_QC" 39.57814 2.349861 "US-Wi5" "PA_F_QC" 51.453972 2.322707 "US-Wi5" "TA_F_QC" 45.790249 2.299027 ... ... ... ... "CN-Ha2" "TS_F_MDS_1_QC" 1.0 0.000019 "CN-Qia" "TS_F_MDS_1_QC" 1.0 0.000019 "CN-Qia" "SWC_F_MDS_1_QC... 1.0 0.000019 "FI-Lom" "P_F_QC" 1.0 0.000019 "CN-Ha2" "TA_F_QC" 1.0 0.000019 "CN-Qia" "LW_IN_F_QC" 1.0 0.000019 "US-Me6" "TS_F_MDS_1_QC" 1.0 0.000011 "US-Me6" "PA_F_QC" 1.0 0.000011 "US-Me6" "SWC_F_MDS_1_QC... 1.0 0.000011 "US-Me6" "TA_F_QC" 1.0 0.000011 "US-Me6" "P_F_QC" 1.0 0.000011 "US-Me6" "VPD_F_QC" 1.0 0.000011
site_info.filter((pl.col("site") == "US-LWW"))  # inspect the site with the largest gap fraction
shape: (1, 3) start end site datetime[μs] datetime[μs] cat 1997-01-01 00:30:00 1999-01-01 00:00:00 "US-LWW"
gap_stat.filter((pl.col("site") == "US-LWW") & (pl.col("variable") == "LW_IN_F_QC" )).collect()  # its individual LW_IN gaps
shape: (22, 5) gap_len variable site total_obs end u32 str str i32 datetime[μs] 246556 "LW_IN_F_QC" "US-LWW" 35039 2000-01-01 00:30:00 19 "LW_IN_F_QC" "US-LWW" 35039 2014-02-11 08:30:00 6 "LW_IN_F_QC" "US-LWW" 35039 2014-03-08 08:00:00 1 "LW_IN_F_QC" "US-LWW" 35039 2014-03-08 11:30:00 1 "LW_IN_F_QC" "US-LWW" 35039 2014-03-08 13:00:00 50 "LW_IN_F_QC" "US-LWW" 35039 2014-11-18 13:00:00 21 "LW_IN_F_QC" "US-LWW" 35039 2014-11-19 22:30:00 22 "LW_IN_F_QC" "US-LWW" 35039 2014-11-24 23:30:00 55 "LW_IN_F_QC" "US-LWW" 35039 2014-11-26 06:30:00 172 "LW_IN_F_QC" "US-LWW" 35039 2014-11-27 19:30:00 70 "LW_IN_F_QC" "US-LWW" 35039 2014-12-01 23:00:00 3 "LW_IN_F_QC" "US-LWW" 35039 2014-12-03 11:30:00 35 "LW_IN_F_QC" "US-LWW" 35039 2014-12-03 16:30:00 92 "LW_IN_F_QC" "US-LWW" 35039 2014-12-04 11:00:00 14 "LW_IN_F_QC" "US-LWW" 35039 2014-12-11 05:00:00 36 "LW_IN_F_QC" "US-LWW" 35039 2014-12-11 18:00:00 36 "LW_IN_F_QC" "US-LWW" 35039 2014-12-12 17:30:00 87 "LW_IN_F_QC" "US-LWW" 35039 2014-12-13 18:00:00 70 "LW_IN_F_QC" "US-LWW" 35039 2014-12-16 01:30:00 37 "LW_IN_F_QC" "US-LWW" 35039 2014-12-17 16:00:00 10 "LW_IN_F_QC" "US-LWW" 35039 2014-12-21 07:30:00 494 "LW_IN_F_QC" "US-LWW" 35039 2014-12-21 17:30:00
import matplotlib  # already imported in the header; repeated by the notebook cell
matplotlib.rcParams.update({'font.size': 22})  # larger fonts for thesis figures
sns.set_style("whitegrid")
def plot_var_dist(var, ax=None, small=True):
    """Histogram of gap lengths for one meteo variable (final thesis version).

    Redefines the earlier draft above.

    var: short variable name (e.g. 'TA'); mapped to its QC column via qc_map
         and to its fixed colour via color_map.
    ax: matplotlib axis to draw on; a fresh single-axis grid if None.
    small: if True, only gaps shorter than one week with the x-axis in hours;
           otherwise all gaps with the x-axis in weeks. y-axis is always log.
    """
    if ax is None: ax = get_grid(1)[0]
    
    color = color_map[var]
    var_qc = qc_map[var]
    ta_gaps = gap_stat.filter(
        (pl.col("variable") == var_qc)
    ).filter(
        # 24*2*7 half-hour steps = 1 week; no-op filter when small is False
        pl.col("gap_len") < (24 * 2 *7) if small else True 
    ).with_column(pl.col("gap_len") / (2 if small else 48 * 7)  # half-hours -> hours or weeks
                 ).collect().to_pandas().hist("gap_len", bins=50, ax=ax, edgecolor="white", color=color)
    ax.set_title(f"{var} - { 'gap length <  1 week' if small else 'all gaps'}")
    ax.set_xlabel(f"Gap length ({ 'hour' if small else 'week'})")
    ax.set_ylabel(f"Log n gaps")
    ax.set_yscale('log') 
    plt.tight_layout()
# All variable names present in the gap statistics.
# NOTE(review): `vars` shadows the builtin of the same name.
vars = gap_stat.select(pl.col("variable").unique()).collect()
vars.filter(pl.col("variable").str.contains("TA"))  # e.g. the TA-related QC columns
shape: (4, 1) variable str "NEE_VUT_USTAR5... "TA_F_MDS_QC" "NEE_CUT_USTAR5... "TA_F_QC"
# 3x3 grid of gap-length histograms, short gaps only, saved for the manuscript.
for ax, var in zip(get_grid(9,3,3, figsize=(15,12), sharey=False), list(var_type.categories)):
    plot_var_dist(var, ax=ax)
plt.savefig(base_path_img / "gap_len_dist_small.pdf")

# Same grid over all gap lengths.
for ax, var in zip(get_grid(9,3,3, figsize=(15,12), sharey=False), list(var_type.categories)):
    plot_var_dist(var, ax=ax, small=False)
plt.savefig(base_path_img / "gap_len_dist.pdf")

Square Root Filter

Numerical Stability

from meteo_imp.kalman.performance import *
# Fuzz-test the square-root filter for numerical stability (result cached on disk).
err = cache_disk(cache_dir / "fuzz_sr")(fuzz_filter_SR)(100, 110) # this is already handling the random seed
p = plot_err_sr_filter(err)
save_show_plot(p, "numerical_stability")

Performance

@cache_disk(cache_dir / "perf_sr")
def get_perf_sr():
    "Benchmark the square-root against the standard Kalman filter (cached on disk)."
    reset_seed()
    # model dimensions match the thesis setup (9 observed, 18 state, 14 control)
    settings = dict(
        n_obs=100,
        n_dim_obs=9,
        n_dim_state=18,
        n_dim_contr=14,
        p_missing=0,
        bs=20,
        init_method='local_slope',
    )
    return perf_comb_params('filter', use_sr_filter=[True, False], rep=range(100), **settings)
# Mean runtime per filter type, with a human-readable label for plotting.
perf1 = (get_perf_sr()
    .groupby('use_sr_filter')
    .agg(pl.col("time").mean())
    .with_column(
        pl.when(pl.col("use_sr_filter"))
        .then(pl.lit("Square Root Filter"))
        .otherwise(pl.lit("Standard Filter"))
        .alias("Filter type")
    ))
# NOTE(review): groupby output row order is not guaranteed; the next cell
# indexes rows positionally — verify, or select rows by the flag instead.
perf1
shape: (2, 3) use_sr_filter time Filter type bool f64 str true 0.231654 "Square Root Fi... false 0.150772 "Standard Filte...
# Slowdown of the square-root filter relative to the standard filter, in %.
# BUG FIX: the original indexed rows positionally (perf1[0] / perf1[1]), which
# relies on the unspecified groupby output order; select by the flag instead.
(perf1.filter(pl.col("use_sr_filter"))[0, 'time'] - perf1.filter(~pl.col("use_sr_filter"))[0, 'time']) / perf1.filter(~pl.col("use_sr_filter"))[0, 'time'] * 100
53.645095282102254
# Bar chart of mean runtime per filter type, exported for the manuscript.
plot_perf_sr = (
    alt.Chart(perf1.to_pandas())
    .mark_bar(size=50)
    .encode(
        x=alt.X('Filter type', axis=alt.Axis(labelAngle=0)),
        y=alt.Y('time', title="time [s]", scale=alt.Scale(zero=False)),
        color=alt.Color('Filter type', scale=alt.Scale(scheme='accent')),
    )
    .properties(width=300)
)
save_show_plot(plot_perf_sr, "perf_sr")